{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8375fdb6-aaba-4b82-a302-ac9f7ba658c2",
   "metadata": {},
   "source": [
    "# Lesson 3 - Recommender Systems"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5edf276",
   "metadata": {},
   "source": [
    "### Import the Needed Packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfc34589-9974-4e28-a77f-95d23be31a38",
   "metadata": {
    "height": 47
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36fc834d-8d7c-4810-bc70-c99dbff741c4",
   "metadata": {
    "height": 182
   },
   "outputs": [],
   "source": [
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from openai import OpenAI\n",
    "from pinecone import Pinecone, ServerlessSpec\n",
    "from tqdm.auto import tqdm, trange\n",
    "from DLAIUtils import Utils\n",
    "\n",
    "import pandas as pd\n",
    "import time\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64ba6310-6905-435f-945a-ebdd0b89415b",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "utils = Utils()\n",
    "PINECONE_API_KEY = utils.get_pinecone_api_key()\n",
    "OPENAI_API_KEY = utils.get_openai_api_key()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "044b8423",
   "metadata": {},
   "source": [
    "### Load the Dataset\n",
    "\n",
    "**Note:** To access the dataset outside of this course, just copy the following two lines of code and run it (remember to uncomment them first before executing):\n",
    "\n",
    "!wget -q --show-progress -O all-the-news-3.zip \"https://www.dropbox.com/scl/fi/wruzj2bwyg743d0jzd7ku/all-the-news-3.zip?rlkey=rgwtwpeznbdadpv3f01sznwxa&dl=1\"\n",
    "\n",
    "!unzip all-the-news-3.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9c5082b-38e2-44c3-8267-9fdbdf4c9903",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "with open('./data/all-the-news-3.csv', 'r') as f:\n",
    "    header = f.readline()\n",
    "    print(header)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da22b23e-a6ab-4e1c-89ca-64ab7bffbc28",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv('./data/all-the-news-3.csv', nrows=99)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f4fde52",
   "metadata": {},
   "source": [
    "### Setup Pinecone"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2851fc4-a2fc-4252-a159-dde8fe44ffa1",
   "metadata": {
    "height": 233
   },
   "outputs": [],
   "source": [
    "openai_client = OpenAI(api_key=OPENAI_API_KEY)\n",
    "util = Utils()\n",
    "INDEX_NAME = utils.create_dlai_index_name('dl-ai')\n",
    "pinecone = Pinecone(api_key=PINECONE_API_KEY)\n",
    "\n",
    "if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:\n",
    "  pinecone.delete_index(INDEX_NAME)\n",
    "\n",
    "pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',\n",
    "  spec=ServerlessSpec(cloud='aws', region='us-west-2'))\n",
    "\n",
    "index = pinecone.Index(INDEX_NAME)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eecf905b",
   "metadata": {},
   "source": [
    "### 1.  Create Embeddings of the News Titles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fa42050-3ab2-4c5d-9a15-125447ebcaca",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "def get_embeddings(articles, model=\"text-embedding-ada-002\"):\n",
    "   return openai_client.embeddings.create(input = articles, model=model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a381e7e8-e4c9-43a1-9a54-5b7c04c29aff",
   "metadata": {
    "height": 301
   },
   "outputs": [],
   "source": [
    "CHUNK_SIZE=400\n",
    "TOTAL_ROWS=10000\n",
    "progress_bar = tqdm(total=TOTAL_ROWS)\n",
    "chunks = pd.read_csv('./data/all-the-news-3.csv', chunksize=CHUNK_SIZE, \n",
    "                     nrows=TOTAL_ROWS)\n",
    "chunk_num = 0\n",
    "for chunk in chunks:\n",
    "    titles = chunk['title'].tolist()\n",
    "    embeddings = get_embeddings(titles)\n",
    "    prepped = [{'id':str(chunk_num*CHUNK_SIZE+i), 'values':embeddings.data[i].embedding,\n",
    "                'metadata':{'title':titles[i]},} for i in range(0,len(titles))]\n",
    "    chunk_num = chunk_num + 1\n",
    "    if len(prepped) >= 200:\n",
    "      index.upsert(prepped)\n",
    "      prepped = []\n",
    "    progress_bar.update(len(chunk))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b372aa0-e700-4bd7-98e6-a931725657ca",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "index.describe_index_stats()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b3cbe32",
   "metadata": {},
   "source": [
    "### Build the Recommender System"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb458754-0a01-4fd1-95b2-64def8a14f39",
   "metadata": {
    "height": 97
   },
   "outputs": [],
   "source": [
    "def get_recommendations(pinecone_index, search_term, top_k=10):\n",
    "  embed = get_embeddings([search_term]).data[0].embedding\n",
    "  res = pinecone_index.query(vector=embed, top_k=top_k, include_metadata=True)\n",
    "  return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8543c5d6-dc51-4190-843f-2c4b0df43571",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "reco = get_recommendations(index, 'obama')\n",
    "for r in reco.matches:\n",
    "    print(f'{r.score} : {r.metadata[\"title\"]}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6eb79a16",
   "metadata": {},
   "source": [
    "### 2.  Create Embeddings of All News Content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "902a05d3-b8f2-4726-8522-64ddeb02e958",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:\n",
    "  pinecone.delete_index(name=INDEX_NAME)\n",
    "\n",
    "pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',\n",
    "  spec=ServerlessSpec(cloud='aws', region='us-west-2'))\n",
    "articles_index = pinecone.Index(INDEX_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69059af0-2a17-417b-8d78-818ed218c672",
   "metadata": {
    "height": 165
   },
   "outputs": [],
   "source": [
    "def embed(embeddings, title, prepped, embed_num):\n",
    "  for embedding in embeddings.data:\n",
    "    prepped.append({'id':str(embed_num), 'values':embedding.embedding, 'metadata':{'title':title}})\n",
    "    embed_num += 1\n",
    "    if len(prepped) >= 100:\n",
    "        articles_index.upsert(prepped)\n",
    "        prepped.clear()\n",
    "  return embed_num"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "310bc783-9dd0-4961-a33e-eaa0a4e544b8",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff1d7; padding:15px; \"> <b>(Note: <code>news_data_rows_num = 100</code>):</b> In this lab, we've initially set <code>news_data_rows_num</code> to 100 for speedier results, allowing you to observe the outcomes faster. Once you've done an initial run, consider increasing this value to 200, 400, 700, and 1000. You'll likely notice better and more relevant results.</p>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "682e44a9-6825-4d01-84af-3b91a1a2771b",
   "metadata": {
    "height": 335
   },
   "outputs": [],
   "source": [
    "news_data_rows_num = 100\n",
    "\n",
    "embed_num = 0 #keep track of embedding number for 'id'\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, \n",
    "    chunk_overlap=20) # how to chunk each article\n",
    "prepped = []\n",
    "df = pd.read_csv('./data/all-the-news-3.csv', nrows=news_data_rows_num)\n",
    "articles_list = df['article'].tolist()\n",
    "titles_list = df['title'].tolist()\n",
    "\n",
    "for i in range(0, len(articles_list)):\n",
    "    print(\".\",end=\"\")\n",
    "    art = articles_list[i]\n",
    "    title = titles_list[i]\n",
    "    if art is not None and isinstance(art, str):\n",
    "      texts = text_splitter.split_text(art)\n",
    "      embeddings = get_embeddings(texts)\n",
    "      embed_num = embed(embeddings, title, prepped, embed_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c524536-7625-49a7-b83a-bba1c0fb7848",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "articles_index.describe_index_stats()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "794940f9",
   "metadata": {},
   "source": [
    "### Build the Recommender System"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fab67de-04c8-40dd-a1bc-34c928dab2ee",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "reco = get_recommendations(articles_index, 'obama', top_k=100)\n",
    "seen = {}\n",
    "for r in reco.matches:\n",
    "    title = r.metadata['title']\n",
    "    if title not in seen:\n",
    "        print(f'{r.score} : {title}')\n",
    "        seen[title] = '.'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7dde0ea-30f7-413c-a0a2-56ec1e159e98",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
